%matplotlib inline
import os
import IPython.display as ipd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import sklearn.utils, sklearn.preprocessing, sklearn.decomposition, sklearn.svm
import utils
# Plot styling shared by the figures below.
sns.set(color_codes=True)
sns.set_context("notebook", font_scale=1)
plt.rcParams["figure.figsize"] = (17, 5)

# Load the pickled track table and take a first look at it.
tracks = pd.read_pickle("clean_data/track.pkl")
tracks.sample(3)
tracks.dtypes
tracks.columns
tracks.shape

# Fully duplicated rows, then the number of missing values per column.
dup_rows = tracks.loc[tracks.duplicated()]
dup_rows.shape
tracks.isna().sum()
# Correlation heatmap over the numeric/boolean columns of the track table.
# Since pandas 2.0, DataFrame.corr() no longer silently drops non-numeric
# columns and raises on them instead, so the numeric subset is selected
# explicitly (this also works on older pandas versions).
plt.figure(figsize=(13, 10))
sns.heatmap(tracks.select_dtypes(include=["number", "bool"]).corr(), annot=True)
plt.title("Correlation Heatmap")
plt.show()
# Track count per top-level genre, as a table and as a bar chart
# (at most the 40 most frequent genres are drawn).
tracks["genre_top"].value_counts()
tracks["genre_top"].value_counts().nlargest(40).plot.bar(figsize=(10, 5))
plt.title("Number of tracks by Genre")
plt.ylabel("Number of tracks")
plt.xlabel("Genre");
# Add one indicator column per genre_top value to the track table.
Data1 = pd.get_dummies(tracks.genre_top)
# `axis` must be passed by keyword: the positional form was deprecated in
# pandas 1.x and is rejected by pandas 2.0.
tracks = pd.concat([tracks, Data1], axis=1)
tracks.shape
tracks.columns
# Bar chart of the number of tracks per genre, most frequent genre first.
# The counts are computed once and reused for x, y and the bar order.
genre_counts = tracks.genre_top.value_counts()
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(
    x=genre_counts.index,
    y=genre_counts.values,
    order=list(genre_counts.index),
)
plt.title("Cantidad de Tracks segun Generos ")
plt.xlabel("Generos")
plt.ylabel("Tracks")
# Correlation between variables
# Correlation between the Echonest variables
lista_echones = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'speechiness', 'tempo', 'valence',
]
corr = tracks.loc[:, lista_echones].corr()

plt.figure(figsize=(13, 10))
sns.heatmap(corr, annot=True)
plt.title("Correlation Heatmap Echonest")
plt.show()

# Show the correlation matrix itself as well.
corr
# Correlation between the genre columns
lista_generos = [
    'Blues', 'Classical', 'Electronic', 'Experimental', 'Folk',
    'Hip-Hop', 'Instrumental', 'International', 'Jazz',
    'Old-Time / Historic', 'Pop', 'Rock',
]
corr = tracks.loc[:, lista_generos].corr()

plt.figure(figsize=(13, 10))
sns.heatmap(corr, annot=True, cmap='Blues')
plt.title("Correlation Heatmap")
plt.show()

# Show the correlation matrix itself as well.
corr
# Heatmap helper, one genre at a time
def plot_genero(df, genero):
    """Show how every column of ``df`` correlates with the ``genero`` column.

    Args:
        df: DataFrame that contains a column named ``genero``; its full
            correlation matrix is computed and only that column is drawn.
        genero: Column name to extract from the correlation matrix; it is
            also appended to the plot title.
    """
    plt.figure(figsize=(2, 5))
    genre_column = df.corr()[[genero]]
    sns.heatmap(genre_column, annot=True)
    plt.title("Correlation Heatmap Echonest by GENERO: " + genero)
    plt.show()
# Draw one single-column heatmap per genre: the Echonest variables plus that
# genre's own column are passed to plot_genero.
lista_echones = [
    'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'speechiness', 'tempo', 'valence',
]
for x in lista_generos:
    df = tracks[[*lista_echones, x]]
    plot_genero(df, x)
import time
import datetime
from datetime import date
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from bokeh.palettes import Category20
from bokeh.models import HoverTool
# Map each distinct genre_top value to one of the Category20 palette colours.
# zip() stops at the shorter input, so values beyond the palette size get no entry.
keys = tracks["genre_top"].unique()
colors = list(Category20[20])
colormap = dict(zip(keys, colors))
colormap
# Line plot of the date_created column.
tracks['date_created'].plot()

# Track duration per genre, drawn on a fresh figure.
plt.figure()
ax = sns.boxplot(data=tracks, x="genre_top", y="duration", fliersize=2)
plt.title("Duracion segun Genero ")
plt.xlabel("Generos")
plt.ylabel("Duracion")
def boxplot_echonest(df, echonest):
    """Draw a box plot of one column of ``df`` grouped by ``genre_top``.

    Args:
        df: DataFrame with a ``genre_top`` column and a column named
            ``echonest``.
        echonest: Name of the column plotted on the y axis; it is also used
            in the title and as the y-axis label.
    """
    sns.boxplot(data=df, x="genre_top", y=echonest, fliersize=2)
    plt.title("Genero por " + echonest)
    plt.xlabel("Generos")
    plt.ylabel(echonest)
    plt.show()
# One box plot per Echonest variable.
for x in lista_echones:
    boxplot_echonest(tracks, x)

# The five most frequent genres.
tracks.genre_top.value_counts().index[:5]
# Column selections used further down.
# `features` omits 'tempo' and ends with the 'genre_top' label column.
features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'speechiness',
    'valence',
    'genre_top',
]
# `full_features` holds the eight audio variables, including 'tempo', without the label.
full_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'speechiness',
    'tempo',
    'valence',
]
def compare_two_genres(df, genre_1, genre_2):
    """Draw a seaborn pair plot of the audio variables for two genres.

    Args:
        df: DataFrame with a ``genre_top`` column and the eight audio
            variable columns listed below.
        genre_1: First ``genre_top`` value to keep.
        genre_2: Second ``genre_top`` value to keep.
    """
    columns = [
        'acousticness', 'danceability', 'energy', 'instrumentalness',
        'liveness', 'speechiness', 'tempo', 'valence', 'genre_top',
    ]
    # Keep only the rows that belong to one of the two genres.
    is_first = df.genre_top == genre_1
    is_second = df.genre_top == genre_2
    subset = df[is_first | is_second][columns]
    marker_style = {'alpha': 0.6, 's': 80, 'edgecolor': 'k'}
    sns.pairplot(subset, hue='genre_top', plot_kws=marker_style)
# Pair plot comparing the 'Hip-Hop' and 'Pop' tracks.
compare_two_genres(tracks, 'Hip-Hop', 'Pop')
def plot_features_profile(df, genre):
    """Draw a radar (polar) chart of the mean feature values for one genre.

    The axes are the entries of the module-level ``features`` list, except
    the ``'genre_top'`` label column, which is not a numeric feature.

    Args:
        df: DataFrame with a ``genre_top`` column and the feature columns.
        genre: ``genre_top`` value whose rows are averaged; also used in the
            chart title.
    """
    # 'genre_top' is the label, not a measurement. Averaging it raises on
    # pandas >= 2.0, and on older versions it was silently dropped, which left
    # the values one short of the angles and produced a bogus 'genre_top' spoke.
    labels = [name for name in features if name != 'genre_top']
    angles = np.linspace(0, 2 * np.pi, len(labels), endpoint=False)
    stats = df.loc[df.genre_top == genre, labels].mean().values

    # Repeat the first point at the end of BOTH vectors so the outline closes
    # and every value stays on its own spoke.
    closed_angles = np.concatenate((angles, [angles[0]]))
    closed_stats = np.concatenate((stats, [stats[0]]))

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, polar=True)
    ax.plot(closed_angles, closed_stats, 'o-', linewidth=2)
    ax.fill(closed_angles, closed_stats, alpha=0.25)
    # One labelled spoke per feature (the unclosed angles, in degrees).
    ax.set_thetagrids(angles * 180 / np.pi, labels)
    ax.set_title(genre + " Profile")
    ax.grid(True)
    plt.show()
# One radar chart per genre.
for x in lista_generos:
    plot_features_profile(df=tracks, genre=x)
# By location: look at the genre distribution
# By genre: look at the locations

# Mean of each feature per genre. 'genre_top' is the grouping key and is also
# listed in `features`; it is filtered out of the selection because averaging
# a non-numeric column raises on pandas >= 2.0.
tracks.groupby('genre_top')[[name for name in features if name != 'genre_top']].mean()

# The ten most frequent locations.
top_localidades = tracks.location.value_counts().head(10)
top_localidades

# Genre indicator totals per location, restricted to those ten locations.
lista_top_localidades = top_localidades.index.tolist()
mask_localidades = tracks.location.isin(lista_top_localidades)
tracks[mask_localidades].groupby('location')[lista_generos].sum().plot(kind='bar', figsize=(20,15))